##### This script plots Figures 1 and 2.

##################################################
# LOAD DATA AND PACKAGES
##################################################

library(readstata13) # for reading Stata data

library(haven) # for reading Shiro's stata data (CES cumulative file)

library(questionr) # for creating weighted tables

### LOAD ANES

raw <- read.dta13('../data/anes_timeseries_cdf_dta/anes_timeseries_cdf_stata13.dta', convert.factors=F) # NES cumulative file

d <- raw

### LOAD GSS

# gss <- read.csv('../data/pid-rates-gss.csv') # Data entered manually from the GSS into a CSV file.

gss <- read.dta13('../data/GSS_stata/gss7224_r1.dta')

### LOAD CES

ces <- read_dta('../data/ces/cumulative_2006-2024.dta')

##################################################
# GRAPHIC OF PID OVER TIME, FULL ANES PID3=IND, THEN ANES PID7=4 BY MODE
##################################################

### Variable selection from ANES

ot <- subset(d, select=c("VCF0004", "VCF0017", "VCF0301", "VCF0009x", "VCF0009y"))
names(ot) <- c("year", "mode", "pid7", "wt", "wtweb")

### Compute shares of PID7=4s by year, by mode

# separate internet from non-internet respondents

face <- ot[ot$mode!=4,]
net <- ot[ot$mode==4,]

# split data into lists

facesplit <- split(face, f=face$year)
netsplit <- split(net, f=net$year)

# proportions of PID by year

pidface <- lapply(facesplit, function(x) prop.table(wtd.table(x$pid7, weights=x$wt)))

pidnet <- lapply(netsplit, function(x) prop.table(wtd.table(x$pid7, weights=x$wtweb)))

theyear <- as.numeric(names(pidface))
theyear2 <- names(pidnet)
facepid <- 100*unlist(lapply(pidface, function(x) x[4]))
netpid <- 100*unlist(lapply(pidnet, function(x) x[4]))

### Compute share of independents (including leaners) by year, regardless of mode

# recode the leaners

ot2 <- ot

table(ot2$pid7)

ot2$pid3 <- ot2$pid7

ot2$pid3[ot2$pid3 %in% c(1,2)] <- 1
ot2$pid3[ot2$pid3 %in% c(3:5)] <- 2
ot2$pid3[ot2$pid3 %in% c(6,7)] <- 3

# split data into lists

ot2split <- split(ot2, f=ot2$year)

# get proportion of pid3=ind by year

pidall <- lapply(ot2split, function(x) prop.table(wtd.table(x$pid3, weights=x$wt)))

allpid <- 100*unlist(lapply(pidall, function(x) x[2]))

allyear <- as.numeric(names(pidall))

### Make the plot

pdf('../output/fig1_1952-2016_anes-only.pdf', width=8, height=5)

# ANES data (all, PID3=2)

plot(allyear, allpid, type='l', main="", xlab="", ylab="", axes=F, lwd=2, xlim=c(1948, 2020), ylim=c(0, 50))
text(allyear[5], allpid[5], pos=1, label="Ind. with leaners")

# ANES data (non-internet)
lines(theyear, facepid, lwd=2)
# points(theyear, facepid, pch=16)
text(theyear[3], facepid[2], pos=1, label='Pure (LI)', cex=1)

# ANES data (internet)
lines(theyear2, netpid, lwd=2, lty=1)
# points(theyear2, netpid, pch=16)
text(theyear2[2], netpid[2], pos=3, label='Pure (SA)', cex=1)

axis(1, tick=F, las=2, at=seq(1948, 2016, 4))
axis(2, tick=F, las=2)

dev.off()

# plot -- all

##################################################
# PLOT TRENDS FROM GSS, CES, AND ANES -- PURE INDS
##################################################

### Compute GSS share of pure independent by year

gss.sub <- subset(gss, select=c("year", "wtssps", "partyid"))

table(gss.sub$partyid) # inspect variable values
str(gss.sub) # is partyid a factor?

gss.sub$pid7 <- rep(NA, nrow(gss.sub))
gss.sub$pid7[gss.sub$partyid=="strong democrat"] <- 1
gss.sub$pid7[gss.sub$partyid=="not very strong democrat"] <- 2
gss.sub$pid7[gss.sub$partyid=="independent, close to democrat"] <- 3
gss.sub$pid7[gss.sub$partyid=="independent (neither, no response)"] <- 4
gss.sub$pid7[gss.sub$partyid=="independent, close to republican"] <- 5
gss.sub$pid7[gss.sub$partyid=="not very strong republican"] <- 6
gss.sub$pid7[gss.sub$partyid=="strong republican"] <- 7

# split by year

gss.split <- split(gss.sub, gss.sub$year)

pidall.gss <- lapply(gss.split, function(x) prop.table(wtd.table(x$pid7, weights=x$wtspss)))

# GSS PID=4 over time

gsspid4 <- unlist(lapply(pidall.gss, '[[', 4))

gsspid4.2 <- cbind(data.frame("year"=names(gsspid4)), "pure"=100*gsspid4)

gsspid4.3 <- gsspid4.2[gsspid4.2$year <= 2016,]

### Compute CES share of pure independent by year

ces.sub <- as.data.frame(subset(ces, select=c("weight_cumulative", "year", "pid7")))

ces.sub$pid7 <- as.numeric(ces.sub$pid7)

table(ces.sub$pid7, exclude=F) # inspect

ces.sub$pid7[ces.sub$pid7==8] <- NA # recode "not sure"

ces.sub$pid7[ces.sub$pid7==9] <- NA # recode "don't know"

# split by year

ces.split <- split(ces.sub, ces.sub$year)

pidall.ces <- lapply(ces.split, function(x) prop.table(wtd.table(x$pid7, weights=x$weight_cumulative)))

# CES PID=4 over time

cespid4 <- unlist(lapply(pidall.ces, '[[', 4))

cespid4.2 <- cbind(data.frame("year"=names(cespid4)), "pure"=100*cespid4)

cespid4.3 <- cespid4.2[cespid4.2$year <= 2016,]

### Plot trends

pdf('../output/fig2_1952-2016_anes_gss_ces.pdf', width=8, height=5)

# ANES data (non-internet)
plot(theyear, facepid, type='l', main="", xlab="", ylab="", axes=F, lwd=2, xlim=c(1948, 2020), ylim=c(4, 25)) # Percent (weighted) of 'pure' independents\nin the ANES, CES, and GSS
# points(theyear, facepid, pch=16)
text(theyear[3], facepid[2], pos=1, label='ANES (LI)')

# ANES data (internet)
lines(theyear2, netpid, lwd=2, lty=1)
# points(theyear2, netpid, pch=16)
text(theyear2[2], netpid[2], pos=4, label='ANES\n(SA)')

# CES data
lines(cespid4.3$year, cespid4.3$pure, lty=3, lwd=2)
# points(schaff$year, schaff$val, pch=1)
text(cespid4.3$year[1], cespid4.3$pure[2], pos=1, label='CES\n(SA)') #pure[1], also \n

# GSS data
lines(gsspid4.3$year, gsspid4.3$pure)
text(gsspid4.3$year[2], gsspid4.3$pure[2], pos=1, label='GSS (LI)')

axis(1, tick=F, las=2, at=seq(1948, 2016, 4))
axis(2, tick=F, las=2)
# text(1962, 14, "ANES\nnon-Internet")
# text(2016, 16.5, pos=1, "ANES\nInternet")
# text(2016, 16, "Internet")
# text(2008, 16, "CCES", pos=2)

# legend('top', bty='n', lty=c(1,3,1), lwd=c(2, 2, 1), legend=c("ANES", "CCES", "GSS"))

dev.off()